1 Load packages

set.seed(132)

# Use library() rather than require() for hard dependencies: require()
# returns FALSE on a missing package instead of erroring, so a typo or
# missing install would only surface much later with a confusing error.
# Load order is preserved exactly — later packages mask earlier ones
# (e.g. data.table::melt vs reshape2::melt), and the analysis below
# relies on the current masking.
library(readr)
library(rsample)
library(skimr)
library(tidymodels)
library(kernlab)
library(dplyr)
library(purrr)
library(yardstick)
library(tictoc)
library(parsnip)
library(furrr)
library(gridExtra)
library(grid)
library(mlbench)
library(probably)
library(ROSE)
library(discrim)
library(modelStudio)
library(janitor)
library(DataExplorer)
library(patchwork)
library(ggforce)

library(reshape2)
library(ggplot2)
library(future)
library(tidyverse)
library(future.apply)
library(glmnet)
library(data.table)

library(survival)
library(ranger)
library(prodlim)
library(riskRegression)
library(pec)
library(survminer)
library(ggcorrplot)
library(corrr)
library(doFuture)

2 Look at data

3 SuddenCardiacDeath

# Long-format view of Data2a for faceted per-parameter plotting:
# one row per (id, parameter) pair; the two endpoint columns are not
# needed for the sudden-cardiac-death comparison and are dropped.
PivotData2a <- Data2a %>%
  select(-primary_ep, -secondary_ep) %>%
  pivot_longer(
    cols      = -c(sudden_cd, id),
    names_to  = "Parameters",
    values_to = "Values"
  )

# Treat the outcome as a discrete grouping variable for ggplot.
PivotData2a <- PivotData2a %>%
  mutate(sudden_cd = as.factor(sudden_cd))

# Paginated boxplots of every parameter by outcome, 4x4 facets per page.
# BUG FIX: theme_bw() is a *complete* theme — it replaces all previously
# applied theme() settings. The original added theme_bw() after the
# text-size tweak, silently discarding the 8pt text setting; apply the
# complete theme first, then the modification.
n_pages <- 10  # hard-coded page count; adjust if the parameter set grows
for (i in seq_len(n_pages)) {
  print(
    ggplot(PivotData2a, aes(Values, colour = sudden_cd)) +
      geom_boxplot() +
      facet_wrap_paginate(Parameters ~ ., nrow = 4, ncol = 4,
                          page = i, scale = "free") +
      labs(title = "Density comparison between label") +
      theme_bw() +
      theme(text = element_text(size = 8))
  )
}

# Modelling dataset: recode the outcome to a readable "Yes"/"No" label
# (case_when's TRUE fallback maps everything else, including NA, to
# "No"), drop identifier/endpoint columns, and keep complete cases only.
Data <- Data2a %>%
  mutate(
    Label = case_when(
      sudden_cd == "1" ~ "Yes",
      TRUE             ~ "No"
    )
  ) %>%
  select(-primary_ep, -sudden_cd, -secondary_ep, -id) %>%
  drop_na()

3.1 Scale and filter linear combinations - Final Dataset

   Data <- Data %>% 
  dplyr::rename(Time = time)

   PreProc <- recipe(Label ~ . , data = Data) %>%
    step_normalize(all_numeric(), -Time) %>%
    step_nzv(all_predictors()) #%>%
  
  PreparedPreProc <-  PreProc  %>% 
    prep()

    invisible(x1 <- Data %>% 
    select_if(is.numeric) %>%
    correlate() %>%
    rearrange() %>%
    shave())
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
    rplot(x1)
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

3.2 Parameters pipeline

# Helper functions for the Cox pipeline. NOTE: ggstatsplot must be the
# first package loaded inside that script.
source("FunctionsCox2.R")

# Experiment configuration ----
numPartitions <- 400      # number of random train/validation permutations
method <- 1               # 1 = classification, 2 = regression, 3 = multiclassification
alpha <- c(1, 0.5)        # glmnet mixing parameters: 1 = LASSO, 0.5 = elastic net

# Working copies of the modelling data used by the downstream stages.
All2 <- Data
FinalAll2F <- data.frame(All2)
All2F <- FinalAll2F

Survival study

## Call: survfit(formula = SurvTimes ~ 1)
## 
##        n events rmean* se(rmean) median 0.95LCL 0.95UCL
## [1,] 371     27   79.5      1.47     NA      NA      NA
##     * restricted mean with upper limit =  87
## Call: survfit(formula = SurvTimes ~ 1)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    371       5    0.987 0.00599        0.975        0.998
##     4    364       2    0.981 0.00707        0.967        0.995
##     9    360       1    0.978 0.00756        0.964        0.993
##    10    359       1    0.976 0.00802        0.960        0.991
##    11    358       1    0.973 0.00845        0.957        0.990
##    13    342       4    0.962 0.01008        0.942        0.982
##    14    334       1    0.959 0.01046        0.938        0.979
##    16    320       1    0.956 0.01084        0.935        0.977
##    19    292       2    0.949 0.01172        0.926        0.972
##    21    271       2    0.942 0.01263        0.918        0.967
##    29    188       1    0.937 0.01352        0.911        0.964
##    31    168       1    0.932 0.01455        0.903        0.960
##    35    139       1    0.925 0.01591        0.894        0.957
##    37    119       1    0.917 0.01757        0.883        0.952
##    42     91       1    0.907 0.02006        0.868        0.947
##    70     27       1    0.873 0.03821        0.802        0.952
##    76     14       1    0.811 0.06980        0.685        0.960
## Warning: Strategy 'multiprocess' is deprecated in future (>= 1.20.0). Instead,
## explicitly specify either 'multisession' or 'multicore'. In the current R
## session, 'multiprocess' equals 'multisession'.
## Warning in supportsMulticoreAndRStudio(...): [ONE-TIME WARNING] Forked
## processing ('multicore') is not supported when running R from RStudio
## because it is considered unstable. For more details, how to control forked
## processing or not, and how to silence this warning in future R sessions, see ?
## parallelly::supportsMulticore

3.3 Functions - Regularization

# The outcome must be a factor for the classification machinery below.
FinalAll2F$Label <- as.factor(FinalAll2F$Label)

set.seed(132)

# Build numPartitions random 85/15 train/validation splits, each baked
# through the fitted preprocessing recipe.
# FIX: the split size is loop-invariant — the original recomputed
# floor(0.85 * nrow(...)) inside every one of the 400 iterations, so
# hoist it (and the row count) out of the loop.
nObs <- nrow(FinalAll2F)
smp_size <- floor(0.85 * nObs)

sInitial3 <- lapply(seq_len(numPartitions), function(i) {

  train_ind <- sample(seq_len(nObs), size = smp_size)

  list(Model = bake(PreparedPreProc, new_data = FinalAll2F[train_ind, ]),
       Validation = bake(PreparedPreProc, new_data = FinalAll2F[-train_ind, ]))

})


# Drop permutations in which either the train or validation part has 0
# or only 1 sample of one class (ImbalClass from FunctionsCox2.R).
sInitial3 <- ImbalClass(sInitial3, c("Model","Validation"),0)
## [1] "Model"
## [1] "Validation"
# Reshape each surviving split into the x-matrix / y-data.frame format
# the glmnet EN and LASSO routines expect. The last two columns of the
# baked data are assumed to be Time and Label, so they are excluded
# from the predictor matrices.
sFS <- lapply(seq_along(sInitial3), function(i) {

  train <- sInitial3[[i]][["Model"]]
  test <- sInitial3[[i]][["Validation"]]

  # BUG FIX: the test matrix previously dropped columns indexed by the
  # *train* dimensions (dim(train)[2]); both sets come from the same
  # recipe so the counts currently agree, but index each data set by its
  # own column count to remove the latent bug.
  drop_train <- c(ncol(train), ncol(train) - 1)
  drop_test <- c(ncol(test), ncol(test) - 1)

  list(xtrain = as.matrix(train[, -drop_train]),
       ytrain = data.frame(Label = as.numeric(train$Label) - 1, Time = train$Time),
       xtest = as.matrix(test[, -drop_test]),
       ytest = data.frame(Label = as.numeric(test$Label) - 1, Time = test$Time))

})

# Duplicate the split list: slot 1 feeds feature selection, slot 2 the
# machine-learning stage. A genuinely separate second partition made no
# sense here, but the two-slot structure is kept for compatibility.
sTog <- list(sFS, sFS)
a <- sTog

# How many permutations survived the class-imbalance filter.
PermutedLength <- lapply(a, length)
print(paste0("Wanted ", numPartitions, " and obtained permuted length of: ", PermutedLength[[1]], " in Feature Selection and ", PermutedLength[[2]] ," in Machine Learning data"))
## [1] "Wanted 400 and obtained permuted length of: 372 in Feature Selection and 372 in Machine Learning data"
# Fit penalised Cox models (LASSO for alpha = 1, elastic net for
# alpha = 0.5) on every permuted split in parallel, and summarise which
# features receive non-zero coefficients.
#
# a1     : list of splits, each containing xtrain (predictor matrix) and
#          ytrain (data.frame with Time and 0/1 Label) as built in sFS.
# alpha  : scalar glmnet mixing parameter.
# method : not referenced inside this function; kept in the signature,
#          presumably for interface consistency with the other pipeline
#          functions in FunctionsCox2.R — TODO confirm.
#
# Returns list(sEN, Cval):
#   sEN  - data.frame (Importance, Names): per feature, the number of
#          splits in which its coefficient was non-zero (zeros removed).
#   Cval - per-split C-index of the refitted model, evaluated on the
#          same training data used to fit it (optimistic estimate).
LassoEn1NewCox <- function(a1,alpha,method){
  
  # Per-split cross-validation to pick lambda; future.seed fixes the RNG
  # across parallel workers so the CV folds are reproducible.
  aa <- future_lapply(seq_along(a1),  function(i) {cv.glmnet(a1[[i]]$xtrain, Surv(a1[[i]]$ytrain$Time,a1[[i]]$ytrain$Label), family="cox", maxit = 1000, alpha = alpha)},future.seed = 132)
  
  # Refit each split at its own lambda.min (single-lambda fit).
  aa.fit <- future_lapply(seq_along(a1),  function(i) {glmnet(a1[[i]]$xtrain, Surv(a1[[i]]$ytrain$Time,a1[[i]]$ytrain$Label), family="cox", maxit = 1000, alpha = alpha ,lambda=aa[[i]][["lambda.min"]])})    
  
# Harrell's C per split, extracted from assess.glmnet()'s $C slot.
Cval <-  future_sapply(seq_along(aa.fit),  function(i) {assess.glmnet(aa.fit[[i]],           #in this case, we are evaluating the model
              newx = a1[[i]]$xtrain,              #in the same data used to fit the model
              newy = Surv(a1[[i]]$ytrain$Time,a1[[i]]$ytrain$Label), family = "cox" )$C[[1]]})
  
 
  # For each fit, flag which coefficients are non-zero, then pool across
  # splits: Add counts selections per feature within one fit's beta
  # matrix; row-binding stacks all splits.
  BetasAndNames <- lapply(seq_along(aa.fit),function(i) {q <- data.frame(Betas=data.table(as.matrix(aa.fit[[i]][["beta"]])))
  q <- data.frame(Add= rowSums(q != 0), Names=aa.fit[[i]][["beta"]]@Dimnames[[1]]) })
  sEN <- do.call("rbind",BetasAndNames)
  names(sEN) <- c("Importance","Names")
  sEN <- sEN[sEN$Importance != 0,] #take out all 0s - sparsity
  return(list(sEN, Cval))
}                                                  

3.4 Functions - Random Forest

3.5 Results

3.6 Visualization feature selection

# Horizontal frequency barplots of the features selected by LASSO and
# EN. The single-element outer lapply mirrors the nesting of mEN.
pdd4 <- lapply(1, function(j) {
  lapply(seq_len(2), function(i) {
    ggplot(mEN[[j]][[i]], aes(reorder(x, +freq), freq, fill = BarLab)) +
      geom_bar(stat = "identity") +
      coord_flip() +
      labs(x = "Features", y = "Frequency") +
      ggtitle(paste0(names(FinalEN[[1]])[i])) +
      theme(axis.text.x = element_text(size = 8),
            legend.position = "none") +
      theme(panel.background = element_blank(),
            panel.grid.minor = element_line(colour = "black"),
            axis.line = element_line(colour = "black"))
  })
})

# Per-model feature-selection frequency tables from the EN/LASSO output.
ElasticNet <- data.frame(Names=FinalEN[[1]][["EN"]][["Names"]], Freq = FinalEN[[1]][["EN"]][["Importance"]], Model = "EN")
Lasso <- data.frame(Names=FinalEN[[1]][["LASSO"]][["Names"]], Freq = FinalEN[[1]][["LASSO"]][["Importance"]], Model = "LASSO")

# FIX: invisible() does not suppress messages — full_join()'s
# 'Joining, by = ...' notes (and melt()'s id-variable note) were still
# printed, as the original output shows. Use suppressMessages() to
# actually silence them.
FinalModelSelected <- suppressMessages(full_join(ElasticNet, Lasso))
FinalModelSelected <- suppressMessages(full_join(FinalModelSelected, RfSumFin))
# across(where(...)) is the modern replacement for mutate_if().
FinalModelSelected <- FinalModelSelected %>%
  mutate(across(where(is.character), as.factor))
FinalModelSelected2 <- suppressMessages(melt(FinalModelSelected))
# Dodged barplot comparing how often each feature was selected by each
# model, ordered from most- to least-frequently selected.
FeatsAll <- ggplot(FinalModelSelected2,
                   aes(reorder(Names, -value), value,
                       group = Model, fill = Model)) +
  geom_bar(stat = "identity", width = .7, position = "dodge") +
  labs(title = paste0("Selected features"),
       y = sprintf(" Frequency & Ranking (out of %i)", PermutedLength[[1]]),
       x = "Features") +
  theme(axis.text.x = element_text(size = 10, angle = 90)) +
  theme(panel.background = element_blank(),
        panel.grid.minor = element_line(colour = "black"),
        axis.line = element_line(colour = "black"))

# Export to a 15 x 7 inch PDF.
pdf("FeatsAll.pdf", 15, 7)
print(FeatsAll)
dev.off()
## quartz_off_screen 
##                 2
## Warning: Removed 4543 rows containing missing values (geom_bar).

3.7 Selected variables and correlation

# Combine the per-model selection plots (patchwork layout) with the
# random-forest plot, on screen and as a PDF.
Lass <- pdd4[[1]][[1]]
En <- pdd4[[1]][[2]]

print(Lass + En + pdd4RF)

# Export the same composite figure to a 15 x 18 inch PDF.
pdf("Models.pdf", 15, 18)
print(Lass + En + pdd4RF)
dev.off()
## quartz_off_screen 
##                 2

## quartz_off_screen 
##                 2
##  [1] "homogeneity_1" "lbp_19"        "moment1"       "var"          
##  [5] "contrast_1"    "correlation_1" "energy_1"      "lgre_3"       
##  [9] "lre_1"         "gln_1"         "hgre_1"

3.8 C-index calculation - training

https://thomaselove.github.io/432-notes/cox-regression-models-for-survival-data-example-2.html https://stats.stackexchange.com/questions/48298/computing-c-index-for-an-external-validation-of-a-cox-ph-model-with-r https://stats.stackexchange.com/questions/444208/cox-ph-r-rms-package-concordance-train-cv-test

## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:parsnip':
## 
##     translate
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## Attaching package: 'rms'
## The following object is masked _by_ '.GlobalEnv':
## 
##     Design
## Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 2 ; coefficient may be infinite.

3.9 Visualization C-index

## Warning in melt(Single): The melt generic in data.table has been passed a
## data.frame and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(Single). In the next version, this warning will become an error.
## No id variables; using all as measure variables

## TableGrob (2 x 1) "arrange": 2 grobs
##   z     cells    name                  grob
## 1 1 (2-2,1-1) arrange    gtable[rowhead-fg]
## 2 2 (1-1,1-1) arrange text[GRID.text.66844]
## quartz_off_screen 
##                 2
##  [1] "lre_1"         "gln_1"         "hgre_1"        "lgre_3"       
##  [5] "lbp_19"        "contrast_1"    "correlation_1" "energy_1"     
##  [9] "homogeneity_1" "var"           "moment1"

4 Performance LASSO, EN and RF

##   Performance_LASSO_Mean Performance_LASSO_SD Performance_EN_Mean
## 1              0.6969878           0.06172307           0.7079199
##   Performance_EN_SD PredictionError_RF_Mean PredictionError_RF_SD
## 1         0.0482745               0.4577323             0.0481219